Importing libraries

library(ggplot2)
library(MASS) # for the example dataset 
library(plyr) # for recoding data
library(ROCR) # for plotting roc
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(e1071) # for NB and SVM
library(rpart) # for decision tree
library(tree)
library(ada) # for adaboost
library(class)#for KNN
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(forecast)
library(tseries)
library(leaflet)

PREPROCESSING

read booking information

a_booked_time=read.csv("air_reserve.csv",header=TRUE, sep=",") # store_id, bookingdatetime, visitdatetime
#summary(a_booked_time) 

load calendar (holiday dataset)

cal=read.csv("date_info.csv",header=TRUE, sep=",")
head(cal)
##   calendar_date day_of_week holiday_flg
## 1    2016-01-01      Friday           1
## 2    2016-01-02    Saturday           1
## 3    2016-01-03      Sunday           1
## 4    2016-01-04      Monday           0
## 5    2016-01-05     Tuesday           0
## 6    2016-01-06   Wednesday           0

Changing all date_time information to date only, as we do not have enough information to do hourly prediction

visitDateTime=as.factor(a_booked_time$visit_datetime)
visitDate=substr(visitDateTime, start = 1, stop = 10)
reserveDateTime=as.factor(a_booked_time$reserve_datetime)
reserveDate=substr(reserveDateTime, start = 1, stop = 10)
visitDate=as.Date(visitDate)
reserveDate=as.Date(reserveDate)
a_booked=a_booked_time
a_booked$visit_datetime=visitDate
a_booked$reserve_datetime=reserveDate
head(a_booked)
##           air_store_id visit_datetime reserve_datetime reserve_visitors
## 1 air_877f79706adbfb06     2016-01-01       2016-01-01                1
## 2 air_db4b38ebe7a7ceff     2016-01-01       2016-01-01                3
## 3 air_db4b38ebe7a7ceff     2016-01-01       2016-01-01                6
## 4 air_877f79706adbfb06     2016-01-01       2016-01-01                2
## 5 air_db80363d35f10926     2016-01-01       2016-01-01                5
## 6 air_db80363d35f10926     2016-01-02       2016-01-01                2

importing store location information

a_store_addr=read.csv("air_store_info.csv",header=TRUE, sep=",")
head(a_store_addr)
##           air_store_id air_genre_name                air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
##   latitude longitude
## 1 34.69512  135.1979
## 2 34.69512  135.1979
## 3 34.69512  135.1979
## 4 34.69512  135.1979
## 5 35.65807  139.7516
## 6 35.65807  139.7516
dim(a_store_addr)
## [1] 829   5

importing store visits per day information

a_visited=read.csv("air_visit_data.csv",header=TRUE, sep=",")
#head(a_visited)
dim(a_visited)
## [1] 252108      3
head(a_visited)
##           air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13       25
## 2 air_ba937bf13d40fb24 2016-01-14       32
## 3 air_ba937bf13d40fb24 2016-01-15       29
## 4 air_ba937bf13d40fb24 2016-01-16       22
## 5 air_ba937bf13d40fb24 2016-01-18        6
## 6 air_ba937bf13d40fb24 2016-01-19        9

Data Analysis

plotting number of visitors for all the restaurants togather throughout the time period

a_visited_cal=merge(a_visited,cal, by.x="visit_date",by.y="calendar_date")
visitDate=(a_visited$visit_date)
visitDate=toString(visitDate)
a_visited$visit_date=as.Date(a_visited$visit_date, format = "%Y-%m-%d")
plot(a_visited$visit_date)

ggplot(a_visited, aes(x =visit_date,y=visitors))+geom_point()

analysing cusine information

#plot(a_store_addr, x = air_area_name)
ggplot(a_store_addr, aes(x = air_genre_name)) + geom_bar()+facet_wrap(~air_genre_name, nrow = 2)+theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

summary(a_store_addr$air_genre_name)
##                        Asian                 Bar/Cocktail 
##                            2                           79 
##                  Cafe/Sweets             Creative cuisine 
##                          181                           13 
##                   Dining bar        International cuisine 
##                          108                            2 
##               Italian/French                      Izakaya 
##                          102                          197 
##                Japanese food                Karaoke/Party 
##                           63                            2 
## Okonomiyaki/Monja/Teppanyaki                        Other 
##                           14                           27 
##                 Western food         Yakiniku/Korean food 
##                           16                           23
dim(a_store_addr)
## [1] 829   5

Conclusion:

  • Asian, international cuisine, karoke/party should be removed as main category and could be clubbed with other
  • Boosting method might be good because classes are so imbalanced.
head(a_store_addr)
##           air_store_id air_genre_name                air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
##   latitude longitude
## 1 34.69512  135.1979
## 2 34.69512  135.1979
## 3 34.69512  135.1979
## 4 34.69512  135.1979
## 5 35.65807  139.7516
## 6 35.65807  139.7516
xy=c(levels(a_store_addr$air_genre_name))
a_store_addr$air_genre_name[which(a_store_addr$air_genre_name %in% c("Asian", "Karaoke/Party", "International cuisine"))] <- factor("Other")
 a_store_addr$air_genre_name=factor(a_store_addr$air_genre_name)
summary(a_store_addr$air_genre_name)
##                 Bar/Cocktail                  Cafe/Sweets 
##                           79                          181 
##             Creative cuisine                   Dining bar 
##                           13                          108 
##               Italian/French                      Izakaya 
##                          102                          197 
##                Japanese food Okonomiyaki/Monja/Teppanyaki 
##                           63                           14 
##                        Other                 Western food 
##                           33                           16 
##         Yakiniku/Korean food 
##                           23
ggplot(a_store_addr, aes(x = air_genre_name)) + geom_bar()+facet_wrap(~air_genre_name, nrow = 2)+theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

analysing number of visitors v/s day of the week analysing number of visitors v/s if it is a holiday

ggplot(a_visited_cal, aes(x = visitors)) + geom_density(aes(group=day_of_week, fill=day_of_week), alpha=.5)+  xlim(0, 150)
## Warning: Removed 76 rows containing non-finite values (stat_density).

day.comp = aov(visitors ~ day_of_week, data =a_visited_cal ) 
summary(day.comp)
##                 Df   Sum Sq Mean Sq F value Pr(>F)    
## day_of_week      6  2670533  445089    1647 <2e-16 ***
## Residuals   252101 68120431     270                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(a_visited_cal$visitors, a_visited_cal$day_of_week)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  a_visited_cal$visitors and a_visited_cal$day_of_week 
## 
##           Friday  Monday  Saturday Sunday  Thursday Tuesday
## Monday    < 2e-16 -       -        -       -        -      
## Saturday  < 2e-16 < 2e-16 -        -       -        -      
## Sunday    5e-10   < 2e-16 < 2e-16  -       -        -      
## Thursday  < 2e-16 < 2e-16 < 2e-16  < 2e-16 -        -      
## Tuesday   < 2e-16 0.00018 < 2e-16  < 2e-16 < 2e-16  -      
## Wednesday < 2e-16 < 2e-16 < 2e-16  < 2e-16 0.01055  < 2e-16
## 
## P value adjustment method: holm
  • Seems like day of the week matters with the tests. ### Importing weather data. Analyzed randomly see the weather dataset( 1663 files of 1663 areas). More than 60% of the data had 6 variables: avg_temp, min_temp, max_temp, precipitation, hour_sunlight, wind speed. To tried to only extract that.
temp = list.files(path="./weather/",pattern="*.csv")
temp2=temp
temp2=paste("./weather/",temp, sep='')
n=dim(as.data.frame(temp))[1]
  w.data=read.csv(temp2[1], header = TRUE, sep=",")
  data.prac=data.frame(w.data$calendar_date)
  data.mint=data.frame(w.data$calendar_date)
  data.maxt=data.frame(w.data$calendar_date)
  data.avgt=data.frame(w.data$calendar_date)
  data.hrsun=data.frame(w.data$calendar_date)
  data.windsp=data.frame(w.data$calendar_date)
for(i in 1:n){
  w.data=read.csv(temp2[i], header = TRUE, sep=",")
  row.names(w.data)
  data.prac =cbind(data.prac, w.data$precipitation)
  data.mint=cbind(data.mint, w.data$low_temperature)
  data.maxt =cbind(data.maxt, w.data$high_temperature)
  data.avgt =cbind(data.prac, w.data$avg_temperature)
  data.hrsun =cbind(data.prac, w.data$hours_sunlight)
  data.windsp =cbind(data.prac, w.data$avg_wind_speed)

  names(data.prac)[names(data.prac)=="w.data$precipitation"] <- temp[i]
  names(data.mint)[names(data.mint)=="w.data$low_temprature"] <- temp[i]
  names(data.maxt)[names(data.prac)=="w.data$high_temprature"] <- temp[i]
  names(data.avgt)[names(data.prac)=="w.data$avg_temprature"] <- temp[i]
  names(data.hrsun)[names(data.prac)=="w.data$hours_sunlight"] <- temp[i]
  names(data.windsp)[names(data.prac)=="w.data$avg_wind_speed"] <- temp[i]
}
head(a_visited_cal)
##   visit_date         air_store_id visitors day_of_week holiday_flg
## 1 2016-01-01 air_c31472d14e29cee8        3      Friday           1
## 2 2016-01-01 air_05c325d315cc17f5       29      Friday           1
## 3 2016-01-01 air_81c5dff692063446        7      Friday           1
## 4 2016-01-01 air_a083834e7ffe187e       27      Friday           1
## 5 2016-01-01 air_36bcf77d3382d36e       34      Friday           1
## 6 2016-01-01 air_09a845d5b5944b01       56      Friday           1
ggplot(a_visited_cal, aes(x = visitors)) + geom_density(aes(group=holiday_flg, fill=holiday_flg), alpha=.5)+  xlim(0, 150)
## Warning: Removed 76 rows containing non-finite values (stat_density).

day.comp = aov(visitors ~ holiday_flg, data =a_visited_cal ) 
summary(day.comp)
##                 Df   Sum Sq Mean Sq F value Pr(>F)    
## holiday_flg      1   100261  100261   357.6 <2e-16 ***
## Residuals   252106 70690702     280                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(a_visited_cal$visitors, a_visited_cal$holiday_flg)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  a_visited_cal$visitors and a_visited_cal$holiday_flg 
## 
##   0     
## 1 <2e-16
## 
## P value adjustment method: holm

Seems like holiday information is also significantly useful with t test.

Mapping of the restaurants. This visual would make it easier to see the areas where the most of the restaurants are located. As can be seen from the map, there are top three areas: Tokyo, Kyoto and Osaka

map_air <- leaflet(a_store_addr) %>% addTiles('http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png') 
map_air %>% addCircles(~longitude, ~latitude, popup=a_store_addr$air_area_name, weight = 3, radius=40, 
                 color="#ffa500", stroke = TRUE, fillOpacity = 0.8)
ggplot(a_visited_cal, aes(x = visitors)) + stat_ecdf()+scale_x_continuous(name="No of visitors",breaks=c(0,5,7,9,13,15,17,20,22,25,30,35,40,50,60,70,100,150), limits=c(0,150))+scale_y_continuous(name="cdf",breaks=c(0,0.2,0.4,0.6,0.8,1.0), limits=c(0,1))
## Warning: Removed 76 rows containing non-finite values (stat_ecdf).

summary(a_visited_cal$visitors)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   17.00   20.97   29.00  877.00

So, if we devide the dataset into 5 classes, our classes would be: Class1: no of visitors betwen 0-6, size of 7 Class2: no of visitors betwen 7-14, size of 8 Class3: no of visitors betwen 14-22, size of 8 Class4: no of visitors betwen 23-35, size of 13 Class5: no of visitors betwen 35 and more The median is 17. And we devide the data on 17 if we use ony 2 classes to keep the data balanced.

head(a_store_addr)
##           air_store_id air_genre_name                air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
##   latitude longitude
## 1 34.69512  135.1979
## 2 34.69512  135.1979
## 3 34.69512  135.1979
## 4 34.69512  135.1979
## 5 35.65807  139.7516
## 6 35.65807  139.7516
dim(a_store_addr)
## [1] 829   5

s Adding all the reservations until taday for next week.

air_vis1 <- a_booked %>%
  filter((as.Date(visit_datetime)-as.Date(reserve_datetime))>7) %>%
  group_by(visit_datetime,air_store_id) %>%
  summarize(vis=sum(reserve_visitors))
colnames(air_vis1)[colnames(air_vis1)=="visit_datetime"] <- "visit_date"
colnames(air_vis1)[colnames(air_vis1)=="vis"] <- "bookingsTillLastWeek"
head(air_vis1)
## # A tibble: 6 x 3
##   visit_date air_store_id         bookingsTillLastWeek
##   <date>     <fct>                               <int>
## 1 2016-01-09 air_54ed43163b7596c4                    5
## 2 2016-01-09 air_ee3a01f0c71a769f                    3
## 3 2016-01-11 air_ee3a01f0c71a769f                    2
## 4 2016-01-12 air_24b9b2a020826ede                    3
## 5 2016-01-12 air_3c05c8f26c611eb9                   10
## 6 2016-01-12 air_6d65542aa43b598b                    2

Or Adding all the reservations(not for predicting future)

air_vis2 <- a_booked %>%
  group_by(visit_datetime,air_store_id) %>%
  summarize(vis=sum(reserve_visitors))
head(air_vis2)
## # A tibble: 6 x 3
## # Groups:   visit_datetime [2]
##   visit_datetime air_store_id           vis
##   <date>         <fct>                <int>
## 1 2016-01-01     air_877f79706adbfb06     3
## 2 2016-01-01     air_db4b38ebe7a7ceff     9
## 3 2016-01-01     air_db80363d35f10926     5
## 4 2016-01-02     air_2b8b29ddfd35018e     6
## 5 2016-01-02     air_3bb99a1fe0583897     4
## 6 2016-01-02     air_6b15edd1b4fbb96a    11

Merging booking done one week in advance with the actual visits.

head(air_vis1)
## # A tibble: 6 x 3
##   visit_date air_store_id         bookingsTillLastWeek
##   <date>     <fct>                               <int>
## 1 2016-01-09 air_54ed43163b7596c4                    5
## 2 2016-01-09 air_ee3a01f0c71a769f                    3
## 3 2016-01-11 air_ee3a01f0c71a769f                    2
## 4 2016-01-12 air_24b9b2a020826ede                    3
## 5 2016-01-12 air_3c05c8f26c611eb9                   10
## 6 2016-01-12 air_6d65542aa43b598b                    2
head(a_visited)
##           air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13       25
## 2 air_ba937bf13d40fb24 2016-01-14       32
## 3 air_ba937bf13d40fb24 2016-01-15       29
## 4 air_ba937bf13d40fb24 2016-01-16       22
## 5 air_ba937bf13d40fb24 2016-01-18        6
## 6 air_ba937bf13d40fb24 2016-01-19        9
airBV=merge(air_vis1, a_visited, all=TRUE)
head(airBV)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd                   NA       10
## 2 2016-01-01 air_70e9e8cd55879414                   NA        4
## 3 2016-01-01 air_7cc17a324ae5c7dc                   NA        2
## 4 2016-01-01 air_81c5dff692063446                   NA        7
## 5 2016-01-01 air_877f79706adbfb06                   NA        3
## 6 2016-01-01 air_87f9e1024b951f01                   NA       17
airBV$visitors[is.na(airBV$visitors)] <- 0
head(airBV)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd                   NA       10
## 2 2016-01-01 air_70e9e8cd55879414                   NA        4
## 3 2016-01-01 air_7cc17a324ae5c7dc                   NA        2
## 4 2016-01-01 air_81c5dff692063446                   NA        7
## 5 2016-01-01 air_877f79706adbfb06                   NA        3
## 6 2016-01-01 air_87f9e1024b951f01                   NA       17
airBV$bookingsTillLastWeek[is.na(airBV$bookingsTillLastWeek)] <- 0
head(airBV)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd                    0       10
## 2 2016-01-01 air_70e9e8cd55879414                    0        4
## 3 2016-01-01 air_7cc17a324ae5c7dc                    0        2
## 4 2016-01-01 air_81c5dff692063446                    0        7
## 5 2016-01-01 air_877f79706adbfb06                    0        3
## 6 2016-01-01 air_87f9e1024b951f01                    0       17
airBV[0:50,]
##    visit_date         air_store_id bookingsTillLastWeek visitors
## 1  2016-01-01 air_04341b588bde96cd                    0       10
## 2  2016-01-01 air_70e9e8cd55879414                    0        4
## 3  2016-01-01 air_7cc17a324ae5c7dc                    0        2
## 4  2016-01-01 air_81c5dff692063446                    0        7
## 5  2016-01-01 air_877f79706adbfb06                    0        3
## 6  2016-01-01 air_87f9e1024b951f01                    0       17
## 7  2016-01-01 air_883ca28ef0ed3d55                    0        4
## 8  2016-01-01 air_9d452a881f7f2bb7                    0        1
## 9  2016-01-01 air_a083834e7ffe187e                    0       27
## 10 2016-01-01 air_cb7467aed805e7fe                    0       33
## 11 2016-01-01 air_db4b38ebe7a7ceff                    0       21
## 12 2016-01-01 air_db80363d35f10926                    0        8
## 13 2016-01-01 air_f690c42545146e0a                    0        7
## 14 2016-01-01 air_fe22ef5a9cbef123                    0       21
## 15 2016-01-01 air_05c325d315cc17f5                    0       29
## 16 2016-01-01 air_08ba8cd01b3ba010                    0       11
## 17 2016-01-01 air_09a845d5b5944b01                    0       56
## 18 2016-01-01 air_1f7f8fa557bc0d55                    0        6
## 19 2016-01-01 air_2570ccb93badde68                    0       43
## 20 2016-01-01 air_25d8e5cc57dd87d9                    0        1
## 21 2016-01-01 air_298513175efdf261                    0       12
## 22 2016-01-01 air_35c4732dcbfe31be                    0       16
## 23 2016-01-01 air_36bcf77d3382d36e                    0       34
## 24 2016-01-01 air_39dccf7df20b1c6a                    0       55
## 25 2016-01-01 air_3b6438b125086430                    0       10
## 26 2016-01-01 air_506fe758114df773                    0        4
## 27 2016-01-01 air_536043fcf1a4f8a4                    0       42
## 28 2016-01-01 air_57ed725a1930a5b9                    0       33
## 29 2016-01-01 air_5b6d18c470bbfaf9                    0       21
## 30 2016-01-01 air_5c65468938c07fa5                    0        8
## 31 2016-01-01 air_60a7057184ec7ec7                    0       64
## 32 2016-01-01 air_6e3fd96320d24324                    0       32
## 33 2016-01-01 air_71903025d39a4571                    0        5
## 34 2016-01-01 air_79f528087f49df06                    0       42
## 35 2016-01-01 air_8d50c64692322dff                    0        1
## 36 2016-01-01 air_b2d8bc9c88b85f96                    0       16
## 37 2016-01-01 air_c31472d14e29cee8                    0        3
## 38 2016-01-01 air_c92745dfdd2ec68a                    0       31
## 39 2016-01-01 air_cfdeb326418194ff                    0        4
## 40 2016-01-01 air_d0a1e69685259c92                    0        7
## 41 2016-01-01 air_d0a7bd3339c3d12a                    0       62
## 42 2016-01-01 air_d0e8a085d8dc83aa                    0        8
## 43 2016-01-01 air_d97dabf7aae60da5                    0      102
## 44 2016-01-01 air_e483f5b3c4f310e0                    0        1
## 45 2016-01-01 air_eb2d2653586315dd                    0       16
## 46 2016-01-01 air_efc80d3f96b3aff7                    0       10
## 47 2016-01-01 air_f26f36ec4dc5adb0                    0       64
## 48 2016-01-01 air_fab092c35776a9b1                    0       19
## 49 2016-01-02 air_142e78ba7001da9c                    0        6
## 50 2016-01-02 air_1653a6c513865af3                    0       35
summary(airBV)
##    visit_date                       air_store_id    bookingsTillLastWeek
##  Min.   :2016-01-01   air_640cf4835f0d9ba3:   483   Min.   :   0.0000   
##  1st Qu.:2016-07-23   air_a083834e7ffe187e:   480   1st Qu.:   0.0000   
##  Median :2016-10-24   air_8093d0b565e9dbdf:   479   Median :   0.0000   
##  Mean   :2016-10-13   air_3c05c8f26c611eb9:   478   Mean   :   0.6388   
##  3rd Qu.:2017-01-25   air_6b15edd1b4fbb96a:   478   3rd Qu.:   0.0000   
##  Max.   :2017-05-31   air_5c817ef28f236bdf:   477   Max.   :2232.0000   
##                       (Other)             :250478                       
##     visitors     
##  Min.   :  0.00  
##  1st Qu.:  9.00  
##  Median : 17.00  
##  Mean   : 20.87  
##  3rd Qu.: 29.00  
##  Max.   :877.00  
## 
dim(airBV)
## [1] 253353      4

l{r} library(ggplot2) p <- airBV %>% filter(visitors<400, bookingsTillLastWeek<150) %>% ggplot(aes( bookingsTillLastWeek,visitors)) + geom_point(color = "black", alpha = 0.5) + geom_abline(slope = 1, intercept = 0, color = "grey60") + geom_smooth(method = "lm", color = "blue") p points lower than grey line shows more people booked in advance than visited points upper than grey line shows less people booked one week in advance but visited

Merging bookings, visits, and location information

airBVI=merge(airBV, a_store_addr)
airBVI$visit_date=as.Date(airBVI$visit_date)
head(airBVI)
##           air_store_id visit_date bookingsTillLastWeek visitors
## 1 air_00a91d42b08b08d9 2017-01-21                    0       11
## 2 air_00a91d42b08b08d9 2017-04-01                    0        7
## 3 air_00a91d42b08b08d9 2017-02-21                    0       36
## 4 air_00a91d42b08b08d9 2016-07-21                    0       47
## 5 air_00a91d42b08b08d9 2017-02-24                    0       28
## 6 air_00a91d42b08b08d9 2016-12-22                    0       37
##   air_genre_name                   air_area_name latitude longitude
## 1 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536
## 2 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536
## 3 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536
## 4 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536
## 5 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536
## 6 Italian/French Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536

Some preprocessing on calander before merging it with main date

colnames(cal)[colnames(cal)=="calendar_date"]= "visit_date"
cal$visit_date=as.Date(cal$visit_date)
head(cal)
##   visit_date day_of_week holiday_flg
## 1 2016-01-01      Friday           1
## 2 2016-01-02    Saturday           1
## 3 2016-01-03      Sunday           1
## 4 2016-01-04      Monday           0
## 5 2016-01-05     Tuesday           0
## 6 2016-01-06   Wednesday           0

Merging bookings, visits, and location, calender information order data by dates

airBVIC=merge(airBVI, cal)
head(airBVIC)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5                    0       29
## 2 2016-01-01 air_f690c42545146e0a                    0        7
## 3 2016-01-01 air_d0a1e69685259c92                    0        7
## 4 2016-01-01 air_5b6d18c470bbfaf9                    0       21
## 5 2016-01-01 air_60a7057184ec7ec7                    0       64
## 6 2016-01-01 air_f26f36ec4dc5adb0                    0       64
##     air_genre_name                         air_area_name latitude
## 1          Izakaya        Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2    Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4      Cafe/Sweets             Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5          Izakaya       Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6          Izakaya        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
##   longitude day_of_week holiday_flg
## 1  130.3928      Friday           1
## 2  141.3540      Friday           1
## 3  139.7035      Friday           1
## 4  139.7522      Friday           1
## 5  130.3700      Friday           1
## 6  139.7035      Friday           1
airBVIC =airBVIC[order(airBVIC$visit_date),] 
head(airBVIC)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5                    0       29
## 2 2016-01-01 air_f690c42545146e0a                    0        7
## 3 2016-01-01 air_d0a1e69685259c92                    0        7
## 4 2016-01-01 air_5b6d18c470bbfaf9                    0       21
## 5 2016-01-01 air_60a7057184ec7ec7                    0       64
## 6 2016-01-01 air_f26f36ec4dc5adb0                    0       64
##     air_genre_name                         air_area_name latitude
## 1          Izakaya        Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2    Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4      Cafe/Sweets             Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5          Izakaya       Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6          Izakaya        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
##   longitude day_of_week holiday_flg
## 1  130.3928      Friday           1
## 2  141.3540      Friday           1
## 3  139.7035      Friday           1
## 4  139.7522      Friday           1
## 5  130.3700      Friday           1
## 6  139.7035      Friday           1
tail(airBVIC)
##        visit_date         air_store_id bookingsTillLastWeek visitors
## 253348 2017-05-29 air_db4b38ebe7a7ceff                    6        0
## 253349 2017-05-29 air_12c4fb7a423df20d                    9        0
## 253350 2017-05-30 air_1033310359ceeac1                    8        0
## 253351 2017-05-31 air_877f79706adbfb06                    3        0
## 253352 2017-05-31 air_900d755ebd2f7bbd                   10        0
## 253353 2017-05-31 air_3cad29d1a23209d2                    3        0
##        air_genre_name                air_area_name latitude longitude
## 253348     Dining bar Ōsaka-fu Ōsaka-shi Shinmachi 34.67623  135.4861
## 253349   Western food  Tōkyō-to Shibuya-ku Shibuya 35.66178  139.7041
## 253350        Izakaya     Tōkyō-to Kōtō-ku Tomioka 35.67127  139.7970
## 253351  Japanese food Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516
## 253352 Italian/French       Tōkyō-to Chūō-ku Ginza 35.67211  139.7708
## 253353  Japanese food  Tōkyō-to Shibuya-ku Shibuya 35.66178  139.7041
##        day_of_week holiday_flg
## 253348      Monday           0
## 253349      Monday           0
## 253350     Tuesday           0
## 253351   Wednesday           0
## 253352   Wednesday           0
## 253353   Wednesday           0
dim(airBVIC) 
## [1] 253353     10

Removing air_Area name as it is essentially giving us no more info than what location is giving us

airBVICwoAname=airBVIC[,-6]
head(airBVICwoAname)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5                    0       29
## 2 2016-01-01 air_f690c42545146e0a                    0        7
## 3 2016-01-01 air_d0a1e69685259c92                    0        7
## 4 2016-01-01 air_5b6d18c470bbfaf9                    0       21
## 5 2016-01-01 air_60a7057184ec7ec7                    0       64
## 6 2016-01-01 air_f26f36ec4dc5adb0                    0       64
##     air_genre_name latitude longitude day_of_week holiday_flg
## 1          Izakaya 33.58922  130.3928      Friday           1
## 2    Japanese food 43.05682  141.3540      Friday           1
## 3 Creative cuisine 35.69384  139.7035      Friday           1
## 4      Cafe/Sweets 35.70807  139.7522      Friday           1
## 5          Izakaya 33.57569  130.3700      Friday           1
## 6          Izakaya 35.69384  139.7035      Friday           1

The restaurant id info is itself very strong indicator if visitors will come or not, that if given, nothing else is more informative.

labels = airBVIC$air_store_id
d=airBVIC
do.pca <- function(dataset,lbls,
                   do.screeplot=F,do.scatter=F,do.biplot=F,do.loadingplot=F) {
  data.pca = prcomp(dataset, scale=TRUE) 
  data.pc = predict(data.pca)
  if (do.screeplot) plot(data.pca, main='screeplot for PCA')
  if (do.scatter) {
    plot(data.pc[,1:2], type="n")
    text(x=data.pc[,1], y=data.pc[,2], labels=lbls)    
  }
  if (do.biplot) biplot(data.pca)
  if (do.loadingplot) {
    plot(data.pca$rotation[,1],type='l')
    #    plot(data.pc[,1],type='l')
  }
  data.pc
}
d$visit_date=as.numeric(d$visit_date)
d$air_store_id = as.numeric(d$air_store_id)
d$air_genre_name = droplevels(d$air_genre_name)
head(d)
##   visit_date air_store_id bookingsTillLastWeek visitors   air_genre_name
## 1      16801          321                    0       29          Izakaya
## 2      16801          302                    0        7    Japanese food
## 3      16801          734                    0        7 Creative cuisine
## 4      16801          478                    0       21      Cafe/Sweets
## 5      16801          493                    0       64          Izakaya
## 6      16801          806                    0       64          Izakaya
##                           air_area_name latitude longitude day_of_week
## 1        Fukuoka-ken Fukuoka-shi Daimyō 33.58922  130.3928      Friday
## 2 Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682  141.3540      Friday
## 3        Tōkyō-to Shinjuku-ku Kabukichō 35.69384  139.7035      Friday
## 4             Tōkyō-to Bunkyō-ku Kasuga 35.70807  139.7522      Friday
## 5       Fukuoka-ken Fukuoka-shi Torikai 33.57569  130.3700      Friday
## 6        Tōkyō-to Shinjuku-ku Kabukichō 35.69384  139.7035      Friday
##   holiday_flg
## 1           1
## 2           1
## 3           1
## 4           1
## 5           1
## 6           1
head(airBVIC)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5                    0       29
## 2 2016-01-01 air_f690c42545146e0a                    0        7
## 3 2016-01-01 air_d0a1e69685259c92                    0        7
## 4 2016-01-01 air_5b6d18c470bbfaf9                    0       21
## 5 2016-01-01 air_60a7057184ec7ec7                    0       64
## 6 2016-01-01 air_f26f36ec4dc5adb0                    0       64
##     air_genre_name                         air_area_name latitude
## 1          Izakaya        Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2    Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4      Cafe/Sweets             Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5          Izakaya       Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6          Izakaya        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
##   longitude day_of_week holiday_flg
## 1  130.3928      Friday           1
## 2  141.3540      Friday           1
## 3  139.7035      Friday           1
## 4  139.7522      Friday           1
## 5  130.3700      Friday           1
## 6  139.7035      Friday           1
d=as.data.frame(airBVIC[,-2])
head(d)
##   visit_date bookingsTillLastWeek visitors   air_genre_name
## 1 2016-01-01                    0       29          Izakaya
## 2 2016-01-01                    0        7    Japanese food
## 3 2016-01-01                    0        7 Creative cuisine
## 4 2016-01-01                    0       21      Cafe/Sweets
## 5 2016-01-01                    0       64          Izakaya
## 6 2016-01-01                    0       64          Izakaya
##                           air_area_name latitude longitude day_of_week
## 1        Fukuoka-ken Fukuoka-shi Daimyō 33.58922  130.3928      Friday
## 2 Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682  141.3540      Friday
## 3        Tōkyō-to Shinjuku-ku Kabukichō 35.69384  139.7035      Friday
## 4             Tōkyō-to Bunkyō-ku Kasuga 35.70807  139.7522      Friday
## 5       Fukuoka-ken Fukuoka-shi Torikai 33.57569  130.3700      Friday
## 6        Tōkyō-to Shinjuku-ku Kabukichō 35.69384  139.7035      Friday
##   holiday_flg
## 1           1
## 2           1
## 3           1
## 4           1
## 5           1
## 6           1

To apply classification techs, we need to devide o/p in classes

airBVICC=airBVIC
airBVICC$visitors[airBVICC$visitors<17]=0
airBVICC$visitors[airBVICC$visitors>=17]=1
airBVICC$visitors=as.factor(airBVICC$visitors)
head(airBVICC)
##   visit_date         air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5                    0        1
## 2 2016-01-01 air_f690c42545146e0a                    0        0
## 3 2016-01-01 air_d0a1e69685259c92                    0        0
## 4 2016-01-01 air_5b6d18c470bbfaf9                    0        1
## 5 2016-01-01 air_60a7057184ec7ec7                    0        1
## 6 2016-01-01 air_f26f36ec4dc5adb0                    0        1
##     air_genre_name                         air_area_name latitude
## 1          Izakaya        Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2    Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4      Cafe/Sweets             Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5          Izakaya       Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6          Izakaya        Tōkyō-to Shinjuku-ku Kabukichō 35.69384
##   longitude day_of_week holiday_flg
## 1  130.3928      Friday           1
## 2  141.3540      Friday           1
## 3  139.7035      Friday           1
## 4  139.7522      Friday           1
## 5  130.3700      Friday           1
## 6  139.7035      Friday           1

grouping and changing types of some predictors.

Date was needed to combine the data. but date itself doesn’t give much information. The information it can give which is day, holiday, weather, etc. Most of these variables we have in our dataset. But many dates will make it difficult for us to classify. To keep track of weather, the month could also give sufficient information as date, but reduces the computation.So we are converting the date to month after all tables are merged. Surely, it will reduce the accuracy if we had lot of data, but we don’t have very large data to leverage the date information anyway.

air_area_name is not useful as it is giving no more info than waht’s given by location cordinates. It was only useful to match weather and main data.

airc=airBVICC
airc$visit_date=(strftime(airc$visit_date, "%m"))
airc$visit_date=as.numeric(airc$visit_date)
airc$air_store_id = as.numeric(airc$air_store_id)
airc$air_genre_name = droplevels(airc$air_genre_name)
summary(airc)
##    visit_date      air_store_id   bookingsTillLastWeek visitors  
##  Min.   : 1.000   Min.   :  1.0   Min.   :   0.0000    0:124016  
##  1st Qu.: 3.000   1st Qu.:200.0   1st Qu.:   0.0000    1:129337  
##  Median : 7.000   Median :409.0   Median :   0.0000              
##  Mean   : 6.203   Mean   :410.2   Mean   :   0.6388              
##  3rd Qu.:10.000   3rd Qu.:618.0   3rd Qu.:   0.0000              
##  Max.   :12.000   Max.   :829.0   Max.   :2232.0000              
##                                                                  
##         air_genre_name                         air_area_name   
##  Izakaya       :62417   Fukuoka-ken Fukuoka-shi Daimyō: 19861  
##  Cafe/Sweets   :52843   Tōkyō-to Shibuya-ku Shibuya   : 17456  
##  Dining bar    :34319   Tōkyō-to Minato-ku Shibakōen  : 14747  
##  Italian/French:30280   Tōkyō-to Shinjuku-ku Kabukichō: 12548  
##  Bar/Cocktail  :25161   Tōkyō-to Setagaya-ku Setagaya :  8722  
##  Japanese food :18910   Tōkyō-to Chūō-ku Tsukiji      :  8396  
##  (Other)       :29423   (Other)                       :171623  
##     latitude       longitude        day_of_week     holiday_flg     
##  Min.   :33.21   Min.   :130.2   Friday   :40590   Min.   :0.00000  
##  1st Qu.:34.69   1st Qu.:135.3   Monday   :31801   1st Qu.:0.00000  
##  Median :35.66   Median :139.7   Saturday :39489   Median :0.00000  
##  Mean   :35.62   Mean   :137.4   Sunday   :30135   Mean   :0.05133  
##  3rd Qu.:35.69   3rd Qu.:139.8   Thursday :38171   3rd Qu.:0.00000  
##  Max.   :44.02   Max.   :144.3   Tuesday  :36177   Max.   :1.00000  
##                                  Wednesday:36990
airc=airc[,-6]
airc$y=airc$visitors
airc=airc[,-4]
dim(airc)
## [1] 253353      9
head(airc)
##   visit_date air_store_id bookingsTillLastWeek   air_genre_name latitude
## 1          1          321                    0          Izakaya 33.58922
## 2          1          302                    0    Japanese food 43.05682
## 3          1          734                    0 Creative cuisine 35.69384
## 4          1          478                    0      Cafe/Sweets 35.70807
## 5          1          493                    0          Izakaya 33.57569
## 6          1          806                    0          Izakaya 35.69384
##   longitude day_of_week holiday_flg y
## 1  130.3928      Friday           1 1
## 2  141.3540      Friday           1 0
## 3  139.7035      Friday           1 0
## 4  139.7522      Friday           1 1
## 5  130.3700      Friday           1 1
## 6  139.7035      Friday           1 1
aira=airc
head(aira)
##   visit_date air_store_id bookingsTillLastWeek   air_genre_name latitude
## 1          1          321                    0          Izakaya 33.58922
## 2          1          302                    0    Japanese food 43.05682
## 3          1          734                    0 Creative cuisine 35.69384
## 4          1          478                    0      Cafe/Sweets 35.70807
## 5          1          493                    0          Izakaya 33.57569
## 6          1          806                    0          Izakaya 35.69384
##   longitude day_of_week holiday_flg y
## 1  130.3928      Friday           1 1
## 2  141.3540      Friday           1 0
## 3  139.7035      Friday           1 0
## 4  139.7522      Friday           1 1
## 5  130.3700      Friday           1 1
## 6  139.7035      Friday           1 1
aira= model.matrix(~.,data=aira) 
head(aira)
##   (Intercept) visit_date air_store_id bookingsTillLastWeek
## 1           1          1          321                    0
## 2           1          1          302                    0
## 3           1          1          734                    0
## 4           1          1          478                    0
## 5           1          1          493                    0
## 6           1          1          806                    0
##   air_genre_nameCafe/Sweets air_genre_nameCreative cuisine
## 1                         0                              0
## 2                         0                              0
## 3                         0                              1
## 4                         1                              0
## 5                         0                              0
## 6                         0                              0
##   air_genre_nameDining bar air_genre_nameItalian/French
## 1                        0                            0
## 2                        0                            0
## 3                        0                            0
## 4                        0                            0
## 5                        0                            0
## 6                        0                            0
##   air_genre_nameIzakaya air_genre_nameJapanese food
## 1                     1                           0
## 2                     0                           1
## 3                     0                           0
## 4                     0                           0
## 5                     1                           0
## 6                     1                           0
##   air_genre_nameOkonomiyaki/Monja/Teppanyaki air_genre_nameOther
## 1                                          0                   0
## 2                                          0                   0
## 3                                          0                   0
## 4                                          0                   0
## 5                                          0                   0
## 6                                          0                   0
##   air_genre_nameWestern food air_genre_nameYakiniku/Korean food latitude
## 1                          0                                  0 33.58922
## 2                          0                                  0 43.05682
## 3                          0                                  0 35.69384
## 4                          0                                  0 35.70807
## 5                          0                                  0 33.57569
## 6                          0                                  0 35.69384
##   longitude day_of_weekMonday day_of_weekSaturday day_of_weekSunday
## 1  130.3928                 0                   0                 0
## 2  141.3540                 0                   0                 0
## 3  139.7035                 0                   0                 0
## 4  139.7522                 0                   0                 0
## 5  130.3700                 0                   0                 0
## 6  139.7035                 0                   0                 0
##   day_of_weekThursday day_of_weekTuesday day_of_weekWednesday holiday_flg
## 1                   0                  0                    0           1
## 2                   0                  0                    0           1
## 3                   0                  0                    0           1
## 4                   0                  0                    0           1
## 5                   0                  0                    0           1
## 6                   0                  0                    0           1
##   y1
## 1  1
## 2  0
## 3  0
## 4  1
## 5  1
## 6  1
aira<-data.frame(aira)
colnames(aira)[which(names(aira) == "y1")] <- "y"
aira$y=as.numeric(aira$y)
aira=as.data.frame(aira[,-1])

dim(aira)
## [1] 253353     23
head(aira)
##   visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1          321                    0                         0
## 2          1          302                    0                         0
## 3          1          734                    0                         0
## 4          1          478                    0                         1
## 5          1          493                    0                         0
## 6          1          806                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
head(aira)
##   visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1          321                    0                         0
## 2          1          302                    0                         0
## 3          1          734                    0                         0
## 4          1          478                    0                         1
## 5          1          493                    0                         0
## 6          1          806                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
dim(aira)
## [1] 253353     23
summary(aira[1:30000,])
##    visit_date     air_store_id   bookingsTillLastWeek
##  Min.   :1.000   Min.   :  4.0   Min.   : 0.0000     
##  1st Qu.:2.000   1st Qu.:184.0   1st Qu.: 0.0000     
##  Median :3.000   Median :408.0   Median : 0.0000     
##  Mean   :2.485   Mean   :404.3   Mean   : 0.5126     
##  3rd Qu.:3.000   3rd Qu.:608.0   3rd Qu.: 0.0000     
##  Max.   :4.000   Max.   :826.0   Max.   :79.0000     
##  air_genre_nameCafe.Sweets air_genre_nameCreative.cuisine
##  Min.   :0.0000            Min.   :0.00000               
##  1st Qu.:0.0000            1st Qu.:0.00000               
##  Median :0.0000            Median :0.00000               
##  Mean   :0.2006            Mean   :0.01327               
##  3rd Qu.:0.0000            3rd Qu.:0.00000               
##  Max.   :1.0000            Max.   :1.00000               
##  air_genre_nameDining.bar air_genre_nameItalian.French
##  Min.   :0.0000           Min.   :0.0000              
##  1st Qu.:0.0000           1st Qu.:0.0000              
##  Median :0.0000           Median :0.0000              
##  Mean   :0.1266           Mean   :0.1158              
##  3rd Qu.:0.0000           3rd Qu.:0.0000              
##  Max.   :1.0000           Max.   :1.0000              
##  air_genre_nameIzakaya air_genre_nameJapanese.food
##  Min.   :0.0000        Min.   :0.000              
##  1st Qu.:0.0000        1st Qu.:0.000              
##  Median :0.0000        Median :0.000              
##  Mean   :0.2695        Mean   :0.079              
##  3rd Qu.:1.0000        3rd Qu.:0.000              
##  Max.   :1.0000        Max.   :1.000              
##  air_genre_nameOkonomiyaki.Monja.Teppanyaki air_genre_nameOther
##  Min.   :0.0000                             Min.   :0.00000    
##  1st Qu.:0.0000                             1st Qu.:0.00000    
##  Median :0.0000                             Median :0.00000    
##  Mean   :0.0134                             Mean   :0.03583    
##  3rd Qu.:0.0000                             3rd Qu.:0.00000    
##  Max.   :1.0000                             Max.   :1.00000    
##  air_genre_nameWestern.food air_genre_nameYakiniku.Korean.food
##  Min.   :0.00000            Min.   :0.0000                    
##  1st Qu.:0.00000            1st Qu.:0.0000                    
##  Median :0.00000            Median :0.0000                    
##  Mean   :0.01703            Mean   :0.0243                    
##  3rd Qu.:0.00000            3rd Qu.:0.0000                    
##  Max.   :1.00000            Max.   :1.0000                    
##     latitude       longitude     day_of_weekMonday day_of_weekSaturday
##  Min.   :33.21   Min.   :130.2   Min.   :0.0000    Min.   :0.0000     
##  1st Qu.:34.68   1st Qu.:135.3   1st Qu.:0.0000    1st Qu.:0.0000     
##  Median :35.66   Median :139.7   Median :0.0000    Median :0.0000     
##  Mean   :35.51   Mean   :137.2   Mean   :0.1281    Mean   :0.1558     
##  3rd Qu.:35.69   3rd Qu.:139.8   3rd Qu.:0.0000    3rd Qu.:0.0000     
##  Max.   :44.02   Max.   :144.3   Max.   :1.0000    Max.   :1.0000     
##  day_of_weekSunday day_of_weekThursday day_of_weekTuesday
##  Min.   :0.0000    Min.   :0.000       Min.   :0.000     
##  1st Qu.:0.0000    1st Qu.:0.000       1st Qu.:0.000     
##  Median :0.0000    Median :0.000       Median :0.000     
##  Mean   :0.1192    Mean   :0.148       Mean   :0.146     
##  3rd Qu.:0.0000    3rd Qu.:0.000       3rd Qu.:0.000     
##  Max.   :1.0000    Max.   :1.000       Max.   :1.000     
##  day_of_weekWednesday  holiday_flg            y         
##  Min.   :0.000        Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.000        1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.000        Median :0.00000   Median :1.0000  
##  Mean   :0.145        Mean   :0.03673   Mean   :0.5327  
##  3rd Qu.:0.000        3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.000        Max.   :1.00000   Max.   :1.0000
airs=airc
colnames(airs)[which(names(airs) == "visitors")] <- "y"
head(airs)
##   visit_date air_store_id bookingsTillLastWeek   air_genre_name latitude
## 1          1          321                    0          Izakaya 33.58922
## 2          1          302                    0    Japanese food 43.05682
## 3          1          734                    0 Creative cuisine 35.69384
## 4          1          478                    0      Cafe/Sweets 35.70807
## 5          1          493                    0          Izakaya 33.57569
## 6          1          806                    0          Izakaya 35.69384
##   longitude day_of_week holiday_flg y
## 1  130.3928      Friday           1 1
## 2  141.3540      Friday           1 0
## 3  139.7035      Friday           1 0
## 4  139.7522      Friday           1 1
## 5  130.3700      Friday           1 1
## 6  139.7035      Friday           1 1
airs$y=as.numeric(airs$y)
#res=my.classifier(aira[1:50000,], cl.name='knn',do.cv=F)

we need to convert categorical predictors into numeric predictors for KNN.

#summary(airBVICC$bookingsTillLastWeek)
airBVICCN=airBVICC[,c("visit_date", "air_store_id", "bookingsTillLastWeek","air_genre_name", "air_area_name", "latitude", "longitude", "day_of_week", "holiday_flg", "visitors")]
airBVICCN$visit_date = as.numeric(airBVICCN$visit_date)
airBVICCN$air_store_id = as.numeric(airBVICCN$air_store_id)
airBVICCN$air_genre_name = as.numeric(airBVICCN$air_genre_name)
airBVICCN$air_area_name = as.numeric(airBVICCN$air_area_name)
airBVICCN$day_of_week = as.numeric(airBVICCN$day_of_week)
head(airBVICCN)
##   visit_date air_store_id bookingsTillLastWeek air_genre_name
## 1      16801          321                    0              6
## 2      16801          302                    0              7
## 3      16801          734                    0              3
## 4      16801          478                    0              2
## 5      16801          493                    0              6
## 6      16801          806                    0              6
##   air_area_name latitude longitude day_of_week holiday_flg visitors
## 1             1 33.58922  130.3928           1           1        1
## 2            24 43.05682  141.3540           1           1        0
## 3            92 35.69384  139.7035           1           1        0
## 4            54 35.70807  139.7522           1           1        1
## 5             8 33.57569  130.3700           1           1        1
## 6            92 35.69384  139.7035           1           1        1
summary(airBVICCN)
##    visit_date     air_store_id   bookingsTillLastWeek air_genre_name  
##  Min.   :16801   Min.   :  1.0   Min.   :   0.0000    Min.   : 1.000  
##  1st Qu.:17005   1st Qu.:200.0   1st Qu.:   0.0000    1st Qu.: 2.000  
##  Median :17098   Median :409.0   Median :   0.0000    Median : 5.000  
##  Mean   :17087   Mean   :410.2   Mean   :   0.6388    Mean   : 4.671  
##  3rd Qu.:17191   3rd Qu.:618.0   3rd Qu.:   0.0000    3rd Qu.: 6.000  
##  Max.   :17317   Max.   :829.0   Max.   :2232.0000    Max.   :11.000  
##  air_area_name       latitude       longitude      day_of_week   
##  Min.   :  1.00   Min.   :33.21   Min.   :130.2   Min.   :1.000  
##  1st Qu.: 25.00   1st Qu.:34.69   1st Qu.:135.3   1st Qu.:2.000  
##  Median : 57.00   Median :35.66   Median :139.7   Median :4.000  
##  Mean   : 52.94   Mean   :35.62   Mean   :137.4   Mean   :3.987  
##  3rd Qu.: 85.00   3rd Qu.:35.69   3rd Qu.:139.8   3rd Qu.:6.000  
##  Max.   :103.00   Max.   :44.02   Max.   :144.3   Max.   :7.000  
##   holiday_flg      visitors  
##  Min.   :0.00000   0:124016  
##  1st Qu.:0.00000   1:129337  
##  Median :0.00000             
##  Mean   :0.05133             
##  3rd Qu.:0.00000             
##  Max.   :1.00000
## standardize/normalize data 
airs=airBVICCN
airs[1:9] <- as.data.frame(scale(airBVICCN[,1:9]))
summary(airs)
##    visit_date        air_store_id       bookingsTillLastWeek
##  Min.   :-2.31847   Min.   :-1.704132   Min.   : -0.08116   
##  1st Qu.:-0.66690   1st Qu.:-0.875300   1st Qu.: -0.08116   
##  Median : 0.08603   Median :-0.004817   Median : -0.08116   
##  Mean   : 0.00000   Mean   : 0.000000   Mean   :  0.00000   
##  3rd Qu.: 0.83895   3rd Qu.: 0.865665   3rd Qu.: -0.08116   
##  Max.   : 1.85903   Max.   : 1.744478   Max.   :283.46359   
##  air_genre_name    air_area_name        latitude          longitude      
##  Min.   :-1.4568   Min.   :-1.6160   Min.   :-1.17336   Min.   :-1.9507  
##  1st Qu.:-1.0600   1st Qu.:-0.8693   1st Qu.:-0.45101   1st Qu.:-0.5700  
##  Median : 0.1304   Median : 0.1263   Median : 0.02023   Median : 0.6296  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.5272   3rd Qu.: 0.9975   3rd Qu.: 0.03776   3rd Qu.: 0.6518  
##  Max.   : 2.5112   Max.   : 1.5575   Max.   : 4.10077   Max.   : 1.8833  
##   day_of_week         holiday_flg      visitors  
##  Min.   :-1.468681   Min.   :-0.2326   0:124016  
##  1st Qu.:-0.976943   1st Qu.:-0.2326   1:129337  
##  Median : 0.006533   Median :-0.2326             
##  Mean   : 0.000000   Mean   : 0.0000             
##  3rd Qu.: 0.990009   3rd Qu.:-0.2326             
##  Max.   : 1.481748   Max.   : 4.2991
set.seed(12345) # set the seed so you can get exactly the same results whenever you run the code
do.classification <- function(train.set, test.set, 
                              cl.name, verbose=F) {
  ## note: to plot ROC later, we want the raw probabilities,
  ## not binary decisions
  switch(cl.name,
         knn = { # here we test k=1; you should evaluate different k's
           prob = knn(train.set[,-1], test.set[,-1], cl=train.set[,1], k = 1, prob=T)
           attr(prob,"prob")[prob==0] = 1-attr(prob,"prob")[prob==0] #modified
           prob = attr(prob,"prob")
           #print(cbind(prob,as.character(test.set$y)))
           prob
         },
         knn3 = {
           prob = knn(train.set[,-1], test.set[,-1], cl=train.set[,1], k = 3, prob=T)
           attr(prob,"prob")[prob==0] = 1-attr(prob,"prob")[prob==0] #modified
           prob = attr(prob,"prob")
           #print(cbind(prob,as.character(test.set$y)))
           prob
         },
         knn5 = { 
           prob = knn(train.set[,-1], test.set[,-1], cl=train.set[,1], k = 5, prob=T)
           attr(prob,"prob")[prob==0] = 1-attr(prob,"prob")[prob==0] #modified
           prob = attr(prob,"prob")
           #print(cbind(prob,as.character(test.set$y)))
           prob
         },
         knn10 = { 
           prob = knn(train.set[,-1], test.set[,-1], cl=train.set[,1], k = 10, prob=T)
           attr(prob,"prob")[prob==0] = 1-attr(prob,"prob")[prob==0] #modified
           prob = attr(prob,"prob")
           #print(cbind(prob,as.character(test.set$y)))
           prob
         },
         lr = { # logistic regression
           model = glm(y~., family=binomial(link="logit"), data=train.set)
           if (verbose) {
             print(summary(model))             
           }
           prob = predict(model, newdata=test.set, type="response") 
           #print(cbind(prob,as.character(test.set$y)))
           prob
         },
         nb = { # naive bayes
           model = naiveBayes(y~., data=train.set)
           prob = predict(model, newdata=test.set, type="raw") 
           #print(cbind(prob,as.character(test.set$y)))
           
           prob = prob[,2]/rowSums(prob) # renormalize the prob.
           prob
         },
      
            dtree = {
           model = rpart(y~., data=train.set, method="anova")
           prob = predict(model, newdata=test.set)
         },
         dtree2 = {
           model = tree(y~.,data=as.data.frame(train.set),mindev=0.02, mincut=3)
           prob = predict(model, newdata=test.set)
          },
         dtree3 = {
           model = dtree(y~.,data=as.data.frame(train.set),methods=c("lm"), tuneLength=3)
           prob = predict(model, newdata=as.data.frame(test.set))
           plot(model)
          text(model,digits=2)
             # printcp(model)
             #  plot(model, uniform=TRUE, main="Classification Tree")
             # text(model, use.n=TRUE, all=TRUE, cex=.8)
          },
         dtreeprune = {
           #model = rpart(y~., data=train.set)
           model <- rpart(y~., data=train.set,control =rpart.control(minsplit=10,minbucket=3, cp=0.001,maxdepth=10))
        
           prob = predict(model, newdata=test.set)
           
           if (1) { # here we prune the tree
             ## prune the tree 
             pfit<- prune(model, cp=model$cptable[which.min(model$cptable[,"xerror"]),"CP"])
             prob = predict(pfit, newdata=test.set)
             ## plot the pruned tree 
             plot(pfit, uniform=TRUE,main="Pruned Classification Tree")
             text(pfit, use.n=TRUE, all=TRUE, cex=.8)             
           }
           #print(cbind(prob,as.character(test.set$y)))
          # renormalize the prob.
           prob
         },
         
        
         ada = {
           model = ada(y~., data = train.set)
           prob = predict(model, newdata=test.set, type='probs')
           #print(cbind(prob,as.character(test.set$y)))
           prob = prob[,2]/rowSums(prob)
           prob
         }
  ) 
}

To do time series cross validation

timeseries.cv <- function(dataset, cl.name, t.fold=16, get.performance=T,prob.cutoff=0.5, val=TRUE) {
  n=t.fold-3
  n.obs <- nrow(dataset) # no. of observations 
  ss=n.obs/t.fold
  errors = dim(t.fold)
  precisions = dim(t.fold)
  recalls = dim(t.fold)
  fscores = dim(t.fold)
  accuracies = dim(t.fold)
  probs = NULL
  actuals = NULL
  print(cl.name)
  for (t in 1:n) {
    ltrain=(t-1)*ss+1
    utrain=((t+1)*ss)-1
    lval=(t+1)*ss
    uval=((t+2)*ss)-1
    ltest=(t+2)*ss
    utest=(t+3)*ss-1
    train.set = dataset[ltrain:utrain,]
      val.set=dataset[lval:uval,]
      test.set = dataset[ltest:utest,]
  if(val==FALSE)
    {
    val.set= test.set
    }
    #cat(t.fold,'-fold CV run',t,cl.name,':', '#training:',nrow(train.set),'#val:',nrow(val.set),'#testing',nrow(test.set),'\n')
    prob = do.classification(train.set, val.set, cl.name)
    predicted = as.numeric(prob > prob.cutoff)
    actual = val.set$y
    predicted=factor(predicted,levels=c(0,1))
    confusion.matrix = table(actual,predicted)
    confusion.matrix
    error = (confusion.matrix[1,2]+confusion.matrix[2,1]) / nrow(test.set)
    errors[t] = error
    #cat('\t\terror=',error,'\n')
    precision = (confusion.matrix[2,2]/(confusion.matrix[2,2]+confusion.matrix[1,2]+0.0001)) # so that denominator does't become 0
    precisions[t] = precision
    #print(confusion.matrix)
    recall =(confusion.matrix[2,2]/(confusion.matrix[2,2]+confusion.matrix[2,1]+0.0001))
    recalls[t] = recall
    fscore = 2*precision*recall/(precision+recall+0.000001)
    fscores[t] = fscore
    probs = c(probs,prob)
    actuals = c(actuals,actual)
  }
  avg.error = mean(errors)
  cat('avg error=',avg.error,'\n')
  avg.accuracy = 1 - avg.error
  cat('avg Accuracy=',avg.accuracy,'\n')
  avg.precision = mean(precisions)
  cat('avg Precision=',avg.precision,'\n')
  avg.recall = mean(recalls)
  cat('avg recall=',avg.recall,'\n')
  avg.fscore = mean(fscores)
  cat('avg fscore=',avg.fscore,'\n')
 
}
# 
# # (1)-1 evaluate the 10-fold cv results in a performance table
# colnames(airs)[which(names(airs) == "visitors")] <- "y"
# head(airs)
# 
# results1 <- cbind(my.classifier(airc, cl.name='lr',do.cv=F),
#                  my.classifier(airs, cl.name='knn',do.cv=F), # use dataset_num for kNN
#                  my.classifier(airc, cl.name='nb',do.cv=F),
#                  my.classifier(airc, cl.name='dtree',do.cv=F),
#                  my.classifier(airc, cl.name='svm',do.cv=F),
#                  my.classifier(airc, cl.name='ada',do.cv=F)
# )
head(aira)
##   visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1          321                    0                         0
## 2          1          302                    0                         0
## 3          1          734                    0                         0
## 4          1          478                    0                         1
## 5          1          493                    0                         0
## 6          1          806                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
timeseries.cv(aira, cl.name='dtree2', t.fold=16, get.performance=T, val=TRUE) 
## [1] "dtree2"
## avg error= 0.5035221 
## avg Accuracy= 0.4964779 
## avg Precision= 0.4248866 
## avg recall= 0.4803596 
## avg fscore= 0.3406344

So, our results are very poor when we used all the variables. As bas as a random guess.

airarid=aira[,-2]
head(airarid)
##   visit_date bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1                    0                         0
## 2          1                    0                         0
## 3          1                    0                         0
## 4          1                    0                         1
## 5          1                    0                         0
## 6          1                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
timeseries.cv(airarid, cl.name='dtree2', t.fold=16, get.performance=T) 
## [1] "dtree2"
## avg error= 0.5035221 
## avg Accuracy= 0.4964779 
## avg Precision= 0.4248866 
## avg recall= 0.4803596 
## avg fscore= 0.3406344

results are bad even if we remove the store ids. As bas as a random guess.

head(aira)
##   visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1          321                    0                         0
## 2          1          302                    0                         0
## 3          1          734                    0                         0
## 4          1          478                    0                         1
## 5          1          493                    0                         0
## 6          1          806                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
timeseries.cv(aira[-1], cl.name='dtree2', t.fold=16, get.performance=T) 
## [1] "dtree2"
## avg error= 0.5035221 
## avg Accuracy= 0.4964779 
## avg Precision= 0.4248866 
## avg recall= 0.4803596 
## avg fscore= 0.3406344

Results are still bad if we remove date information. As bas as a random guess.

head(aira)
##   visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1          1          321                    0                         0
## 2          1          302                    0                         0
## 3          1          734                    0                         0
## 4          1          478                    0                         1
## 5          1          493                    0                         0
## 6          1          806                    0                         0
##   air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1                              0                        0
## 2                              0                        0
## 3                              1                        0
## 4                              0                        0
## 5                              0                        0
## 6                              0                        0
##   air_genre_nameItalian.French air_genre_nameIzakaya
## 1                            0                     1
## 2                            0                     0
## 3                            0                     0
## 4                            0                     0
## 5                            0                     1
## 6                            0                     1
##   air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1                           0                                          0
## 2                           1                                          0
## 3                           0                                          0
## 4                           0                                          0
## 5                           0                                          0
## 6                           0                                          0
##   air_genre_nameOther air_genre_nameWestern.food
## 1                   0                          0
## 2                   0                          0
## 3                   0                          0
## 4                   0                          0
## 5                   0                          0
## 6                   0                          0
##   air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1                                  0 33.58922  130.3928                 0
## 2                                  0 43.05682  141.3540                 0
## 3                                  0 35.69384  139.7035                 0
## 4                                  0 35.70807  139.7522                 0
## 5                                  0 33.57569  130.3700                 0
## 6                                  0 35.69384  139.7035                 0
##   day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1                   0                 0                   0
## 2                   0                 0                   0
## 3                   0                 0                   0
## 4                   0                 0                   0
## 5                   0                 0                   0
## 6                   0                 0                   0
##   day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1                  0                    0           1 1
## 2                  0                    0           1 0
## 3                  0                    0           1 0
## 4                  0                    0           1 1
## 5                  0                    0           1 1
## 6                  0                    0           1 1
typeof(aira)
## [1] "list"
data = subset(aira[, c("visit_date", "y")])
timeseries.cv(data, cl.name='dtree2', t.fold=16, get.performance=T) 
## [1] "dtree2"
## avg error= 0.5048192 
## avg Accuracy= 0.4951808 
## avg Precision= 0.3097959 
## avg recall= 0.6153846 
## avg fscore= 0.4118524

It’s still pretty bad. As bas as a random guess.

all.classifiers <- function(dataset) {
  timeseries.cv(dataset, cl.name='dtree2', t.fold=16, get.performance=T)
timeseries.cv(dataset, cl.name='dtree', t.fold=16, get.performance=T)
timeseries.cv(dataset, cl.name='lr', t.fold=16, get.performance=T)
timeseries.cv(dataset, cl.name='nb', t.fold=16, get.performance=T)
}
# results1 <- cbind(timeseries.cv(aira, cl.name='dtree2', t.fold=16, get.performance=T) ,
#                 my.classifier(airs, cl.name='knn',t.fold=16, do.cv=T),
#                 timeseries.cv(aira, cl.name='dtree', t.fold=16, get.performance=T) ,
#                timeseries.cv(aira, cl.name='lr', t.fold=16, get.performance=T) ,
#                 timeseries.cv(aira, cl.name='nb', t.fold=16, get.performance=T) )
airss=airs
colnames(airss)[which(names(airss) == "visitors")] <- "y"
head(airss)
##   visit_date air_store_id bookingsTillLastWeek air_genre_name
## 1  -2.318467   -0.3713360          -0.08115527      0.5272422
## 2  -2.318467   -0.4504708          -0.08115527      0.9240411
## 3  -2.318467    1.3488041          -0.08115527     -0.6631546
## 4  -2.318467    0.2825671          -0.08115527     -1.0599535
## 5  -2.318467    0.3450419          -0.08115527      0.5272422
## 6  -2.318467    1.6486832          -0.08115527      0.5272422
##   air_area_name    latitude longitude day_of_week holiday_flg y
## 1   -1.61600084 -0.98927786 -1.897016   -1.468681    4.299142 1
## 2   -0.90040186  3.63047062  1.088242   -1.468681    4.299142 0
## 3    1.21528207  0.03768064  0.638755   -1.468681    4.299142 0
## 4    0.03298811  0.04462304  0.651996   -1.468681    4.299142 1
## 5   -1.39820985 -0.99587958 -1.903236   -1.468681    4.299142 1
## 6    1.21528207  0.03768064  0.638755   -1.468681    4.299142 1
#all.classifiers(aira) 
# 
# timeseries.cv(aira, cl.name='dtreeprune', t.fold=32, get.performance=T) 
# timeseries.cv(airss, cl.name='dtreeprune', t.fold=16, get.performance=T) 

# model <- rpart(y~., data=aira[1:10000,],control =rpart.control(minsplit=10,minbucket=3, cp=0,maxdepth=10))
# prob = predict(model, newdata=aira[10000:15000,])
#  
# model$cptable
# 
# pfit<- prune(model, cp=0.01)
#     plot(model)
#         x=which.min(model$cptable[,"xerror"])
# print(x)
# x=which.min(model$cptable[,"xerror"],"CP")
    
#head(airs)
#do.classification(airc[1:10000,], aira[10000:15000,], 'nb')

Maybe because we are trying to use one model fit all stategy on all the restaurants, results are coming really bad. As we saw every restaurnt is very significant to the results before, let us try single model for different restaurnt

Taking only one restaurant information, only it’s date as in 365 days

ndf <- a_visited[a_visited$air_store_id == "air_36bcf77d3382d36e",]
head(ndf)
##                air_store_id visit_date visitors
## 204402 air_36bcf77d3382d36e 2016-01-01       34
## 204403 air_36bcf77d3382d36e 2016-01-02       43
## 204404 air_36bcf77d3382d36e 2016-01-03       30
## 204405 air_36bcf77d3382d36e 2016-01-05       38
## 204406 air_36bcf77d3382d36e 2016-01-06       22
## 204407 air_36bcf77d3382d36e 2016-01-07       18
summary(ndf)
##                air_store_id   visit_date            visitors    
##  air_36bcf77d3382d36e:476   Min.   :2016-01-01   Min.   : 6.00  
##  air_00a91d42b08b08d9:  0   1st Qu.:2016-04-29   1st Qu.:18.00  
##  air_0164b9927d20bcc3:  0   Median :2016-08-26   Median :24.00  
##  air_0241aa3964b7f861:  0   Mean   :2016-08-26   Mean   :31.78  
##  air_0328696196e46f18:  0   3rd Qu.:2016-12-23   3rd Qu.:45.50  
##  air_034a3d5b40d5b1b1:  0   Max.   :2017-04-22   Max.   :99.00  
##  (Other)             :  0
ndf=ndf[,-1]
head(ndf)
##        visit_date visitors
## 204402 2016-01-01       34
## 204403 2016-01-02       43
## 204404 2016-01-03       30
## 204405 2016-01-05       38
## 204406 2016-01-06       22
## 204407 2016-01-07       18
ndf$visit_date=as.numeric(ndf$visit_date)%%365
head(ndf)
##        visit_date visitors
## 204402         11       34
## 204403         12       43
## 204404         13       30
## 204405         15       38
## 204406         16       22
## 204407         17       18
table(ndf$y)
## < table of extent 0 >
ndf1=ndf
colnames(ndf1)[which(names(ndf1) == "visitors")] <- "y"
tail(ndf1)
##        visit_date  y
## 204872        118 24
## 204873        119 13
## 204874        120 30
## 204875        121 19
## 204876        122 21
## 204877        123 50
ndf1$y[ndf1$y<17]=0
ndf1$y[ndf1$y>=17]=1
summary(ndf1$y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.8277  1.0000  1.0000
ndf1$y=as.numeric(ndf1$y)
head(ndf1)
##        visit_date y
## 204402         11 1
## 204403         12 1
## 204404         13 1
## 204405         15 1
## 204406         16 1
## 204407         17 1
all.classifiers(ndf1)
## [1] "dtree2"
## avg error= 0.2758621 
## avg Accuracy= 0.7241379 
## avg Precision= 0.6976103 
## avg recall= 0.8461503 
## avg fscore= 0.763748 
## [1] "dtree"
## avg error= 0.1750663 
## avg Accuracy= 0.8249337 
## avg Precision= 0.8249308 
## avg recall= 0.9999958 
## avg fscore= 0.9030198 
## [1] "lr"
## avg error= 0.2148541 
## avg Accuracy= 0.7851459 
## avg Precision= 0.820778 
## avg recall= 0.9439481 
## avg fscore= 0.8698828 
## [1] "nb"
## avg error= 0.2519894 
## avg Accuracy= 0.7480106 
## avg Precision= 0.7645651 
## avg recall= 0.8797343 
## avg fscore= 0.8104717
#timeseries.cv(ndf1, cl.name='dtree2', t.fold=16, get.performance=T) 

The results are better. D tree gives the minimum error and best f score So, we choose this as our model and now apply on test set.

timeseries.cv(ndf1, cl.name='dtree', t.fold=16, get.performance=T, val=FALSE) 
## [1] "dtree"
## avg error= 0.1803714 
## avg Accuracy= 0.8196286 
## avg Precision= 0.8196258 
## avg recall= 0.9999958 
## avg fscore= 0.8999587

We report this as our classifcation model results.

Now we also try autoregressive models to predict one week in advance. For this we only choose one particular restaurant that has all time series visitation information

ggplot(ndf, aes(visit_date,visitors)) +geom_line(col = "blue") +labs(y = "All visitors", x = "Date")

Some preprocessing and cleansing( removing outliers)

count_ts = ts(ndf[, c('visitors')])
ndf$clean_cnt = tsclean(count_ts)
ggplot() +geom_line(data = ndf, aes(x = visit_date, y = clean_cnt)) + ylab('!!!Cleaned!!!')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

ndf$cnt_ma = ma(ndf$clean_cnt, order=7) # clean the dataset
ndf$cnt_ma30 = ma(ndf$clean_cnt, order=3)
ggplot() +
  geom_line(data = ndf, aes(x = visit_date, y = clean_cnt, colour = "Counts")) +
  geom_line(data = ndf, aes(x = visit_date, y = cnt_ma,   colour = "Weekly MA"))  +
  geom_line(data = ndf, aes(x = visit_date, y = cnt_ma30, colour = "Three Days MA"))  +
  ylab('Count')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

head(ndf)
##        visit_date visitors clean_cnt   cnt_ma cnt_ma30
## 204402         11       34        34       NA       NA
## 204403         12       43        43       NA 35.66667
## 204404         13       30        30       NA 37.00000
## 204405         15       38        38 29.00000 30.00000
## 204406         16       22        22 30.85714 26.00000
## 204407         17       18        18 31.42857 19.33333
#deseasonalized series
count_ma = ts(na.omit(ndf$cnt_ma30), frequency=7)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)

adf.test(count_ma, alternative = "stationary")
## Warning in adf.test(count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_ma
## Dickey-Fuller = -8.6512, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_ma, main='')

Pacf(count_ma, main='')

count_d1 = diff(deseasonal_cnt, differences = 1)
plot(count_d1)

adf.test(count_d1, alternative = "stationary")
## Warning in adf.test(count_d1, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_d1
## Dickey-Fuller = -10.736, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_d1, main='ACF for Differenced Series')

Pacf(count_d1, main='PACF for Differenced Series')

auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt 
## ARIMA(0,0,2) with non-zero mean 
## 
## Coefficients:
##          ma1     ma2     mean
##       1.0262  0.9715  31.7685
## s.e.  0.0156  0.0108   0.5632
## 
## sigma^2 estimated as 16.91:  log likelihood=-1344.29
## AIC=2696.59   AICc=2696.67   BIC=2713.23
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='(1,1,1) Model Residuals')

fit2 = arima(deseasonal_cnt, order=c(1,1,7))
fit2
## 
## Call:
## arima(x = deseasonal_cnt, order = c(1, 1, 7))
## 
## Coefficients:
##           ar1     ma1     ma2      ma3      ma4     ma5     ma6      ma7
##       -0.1923  0.3782  -0.054  -1.0965  -0.3096  0.0572  0.1003  -0.0756
## s.e.   0.2775  0.2751   0.077   0.0568   0.3102  0.0772  0.0555   0.0547
## 
## sigma^2 estimated as 15.78:  log likelihood = -1331.31,  aic = 2680.63
tsdisplay(residuals(fit2), lag.max=15, main='Seasonal Model Residuals')

fcast <- forecast(fit2, h=7)
plot(fcast)

holdout the last month and start predicting on that. We got ARIMA parameters order from the ACF and PACF plots

hold <- window(ts(deseasonal_cnt), start=440)
fit_no_holdout = arima(ts(deseasonal_cnt[-c(440:470)]), order=c(1,1,7))
fcast_no_holdout <- forecast(fit_no_holdout,h=31)
y_=fcast_no_holdout$mean
test=ndf[440:470,]
y=test$y
y
## NULL
tail(ndf)
##        visit_date visitors clean_cnt   cnt_ma cnt_ma30
## 204872        118       24        24 36.28571 33.00000
## 204873        119       13        13 33.57143 22.33333
## 204874        120       30        30 31.28571 20.66667
## 204875        121       19        19       NA 23.33333
## 204876        122       21        21       NA 30.00000
## 204877        123       50        50       NA       NA
plot(fcast_no_holdout, main=" ")
lines(ts(deseasonal_cnt))

y_[y_<17]=0
y_[y_>=17]=1
y[y<17]=0
y[y>=17]=1
#table(y,y_)

accuracy, precision, f score when regression result is used as classficiation classes.

Regression results:

fit_w_seasonality = auto.arima(deseasonal_cnt, seasonal=TRUE)
fit_w_seasonality
## Series: deseasonal_cnt 
## ARIMA(1,0,1)(1,0,2)[7] with non-zero mean 
## 
## Coefficients:
##          ar1     ma1    sar1     sma1    sma2     mean
##       0.6131  0.2628  0.9124  -0.8922  0.1159  31.5342
## s.e.  0.0424  0.0405  0.0428   0.0627  0.0479   1.7050
## 
## sigma^2 estimated as 24.61:  log likelihood=-1430.22
## AIC=2874.45   AICc=2874.69   BIC=2903.58

Prophet

We used Prophet to predict the future visitation dates Prophet is the library that was developed by Facebook. It uses Additive Regression model. It automatically detects the changes in the trend, find some seasonal patterns

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
ndf_p <- read.csv("ndf_1.csv")
ndf_p <- ndf_p[, c('visit_date', 'visitors')]
setnames(ndf_p, old=c("visit_date","visitors"), new=c("ds", "y"))
head(ndf_p)
##           ds  y
## 1 2016-01-01 34
## 2 2016-01-02 43
## 3 2016-01-03 30
## 4 2016-01-05 38
## 5 2016-01-06 22
## 6 2016-01-07 18
library(prophet)
## Loading required package: Rcpp
m <- prophet(ndf_p)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Initial log joint probability = -12.744
## Optimization terminated normally: 
##   Convergence detected: absolute parameter change was below tolerance
future <- make_future_dataframe(m, periods = 31)
forecast <- predict(m, future)
plot(m, forecast)

yhat column contains the forecast including additional columns like uncertainty intervals and seasonal components of the forecast

tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
##             ds     yhat yhat_lower yhat_upper
## 502 2017-05-18 22.90804   9.623232   36.38780
## 503 2017-05-19 30.96694  17.196381   45.09868
## 504 2017-05-20 57.42978  42.647437   71.70871
## 505 2017-05-21 51.28849  37.397565   64.73551
## 506 2017-05-22 22.98990   9.327725   38.23588
## 507 2017-05-23 21.47555   6.366852   35.02161
y_=forecast$yhat[440:470]
test=ndf[440:470,]
y=test$y

y_[y_<17]=0
y_[y_>=17]=1
y[y<17]=0
y[y>=17]=1

#table(y,y_)

This component allow us to see the forecast that has been broken into trends, weekly trends. As our data analysis part showed, more visitors go to the restaurants during the weekend (plus on Fridays)

prophet_plot_components(m, forecast)

Conclusions:

Many features seemed to be very relevant for classficiation, but turned out to be not that useful, and date information alone is very good predictor for finding customer traffic.

Future works:

We later realise for the restaurant data we have used, the classes are not balanced i.e. thier median is not 17, and if we run on thier median of the hotel, we are getting bad results with classification but still good results with autoregressive models. The immideate future work needs finetune for classification models. Also, time series cross validation for the autoregressive models is yet to be implimented, like for classification we did. We have worked on weather data a lot but final manual change of names would take a lot of time before combining. We are not left with time, and hence we submit our code here.